Analysis: CrispRVariants

Brzezinski - Purvis Dev paper CRISPR editing outcomes

Author
Affiliation

Michael Kaufman, PhD

1 Notes

This document is part of the Purvis Dev 2025 CRISPR editing outcomes analysis. It uses the CrispRVariants package to analyze CRISPR editing outcomes from paired-end sequencing data.

The following external tools are used in this analysis and must be installed and available in your PATH: bwa mem is used for alignment, and CrispRVariants is used to analyze the editing outcomes. samtools is used for BAM file processing.

2 Environment

[code]
knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.align='center')

# Start timing the document render
start_time <- Sys.time()

2.1 Load libraries

[code]
library(CrispRVariants)
library(Rsamtools)
library(GenomicRanges)
library(GenomicFeatures)
library(Biostrings)
library(DT)
library(gridExtra)

source("scripts/qmd_embed_download.R")
source("scripts/renv_embed_download.R")

set.seed(87645893)

dir.create(params$outs_path, showWarnings = TRUE)

3 Sample metadata

[code]
# Load the sample metadata
sample_metadata <- read.csv(file.path(params$sample_metadata_file), header = TRUE)

datatable(sample_metadata)

4 CrispR editing outcomes analysis

4.1 Alignment - Paired-end sample processing

[code]
bwa_ref <- params$ref_genome
raw_data_dir <- params$data_path
bam_dir <- params$processed_data_path

# Get all fastq files in the raw data directory
fq_fnames <- list.files(raw_data_dir, pattern = "fastq.gz$", full.names = TRUE)

# For paired-end reads, group files by sample name (handling _R1_001/_R2_001 naming)
# Extract sample names by removing _R1_001/_R2_001 suffixes
sample_names <- unique(gsub("_R[12]_001", "", basename(fq_fnames)))
sample_names <- gsub("\\.fastq\\.gz$", "", sample_names)

bam_dir_full <- file.path(bam_dir)
dir.create(bam_dir_full, showWarnings = FALSE, recursive = TRUE)

# Process each sample
for(sample in sample_names) {
  # Find R1 and R2 files for this sample
  r1_file <- fq_fnames[grepl(paste0(sample, "_R1_001"), basename(fq_fnames))]
  r2_file <- fq_fnames[grepl(paste0(sample, "_R2_001"), basename(fq_fnames))]
  
  # Check if we have both R1 and R2
  if(length(r1_file) == 1 && length(r2_file) == 1) {
    # Define file names explicitly
    unsorted_bam <- file.path(bam_dir_full, paste0(sample, "_unsorted.bam"))
    sorted_bam <- file.path(bam_dir_full, paste0(sample, "_sorted.bam"))
    
    # Skip if already processed
    if(!file.exists(paste0(sorted_bam, ".bai"))) {
      cmd <- paste0("bwa mem -t ", params$bwa_threads, " ", bwa_ref, " ", r1_file, " ", r2_file,
                    " | samtools view -Sb - > ", unsorted_bam)
      message("Processing paired-end sample: ", sample)
      message(cmd, "\n")
      system(cmd)
      
      # Sort and index - FIX: Remove .bam extension for sortBam()
      message("Sorting and indexing: ", basename(sorted_bam))
      sorted_bam_base <- tools::file_path_sans_ext(sorted_bam)  # Remove .bam extension
      sortBam(unsorted_bam, sorted_bam_base)  # sortBam will add .bam automatically
      indexBam(sorted_bam)  # Index the final .bam file
      unlink(unsorted_bam)  # Remove unsorted BAM
      
      message("Created: ", sorted_bam)
    } else {
      message("Already processed: ", sample)
    }
    
  } else {
    warning("Could not find proper R1/R2 files for sample: ", sample)
    message("R1 file found: ", length(r1_file), " - ", ifelse(length(r1_file) > 0, r1_file, "none"))
    message("R2 file found: ", length(r2_file), " - ", ifelse(length(r2_file) > 0, r2_file, "none"))
  }
}

# Get list of sorted BAM files for downstream analysis
sorted_bam_files <- list.files(bam_dir_full, pattern = "_sorted\\.bam$", full.names = TRUE)
message("Found ", length(sorted_bam_files), " sorted BAM files:")
for(bam_file in sorted_bam_files) {
  message("  - ", bam_file)
  message("    Index exists: ", file.exists(paste0(bam_file, ".bai")))
}

4.2 Create the target location and reference sequence

[code]
# Define the target location and reference sequence as granges
target_location <- GRanges(
  seqnames = params$target_chr,
  ranges = IRanges(start = params$target_start, end = params$target_end),
  strand = params$target_strand
)

# Extract reference sequence
reference_raw <- system(sprintf("samtools faidx %s %s:%s-%s",
                               params$ref_genome,
                               seqnames(target_location)[1], 
                               start(target_location)[1], 
                               end(target_location)[1]),
                       intern = TRUE)[[2]]

# Convert to DNAString and reverse complement
reference <- Biostrings::reverseComplement(Biostrings::DNAString(reference_raw))

# Check lengths match
target_width <- width(target_location)
ref_length <- nchar(reference_raw)

message("Target width: ", target_width)
message("Reference length: ", ref_length)
message("Reference sequence: ", reference_raw)

# If they don't match, adjust the target to match the reference
if(target_width != ref_length) {
  message("Adjusting target location to match reference sequence length")
  target_location <- GRanges(
    seqnames = params$target_chr,
    ranges = IRanges(start = params$target_start, end = params$target_start + ref_length - 1),
    strand = params$target_strand
  )
  message("New target width: ", width(target_location))
}

4.3 CrisprSet object

This object is created from the sorted BAM files and the target location. It will contain the read counts and variant information for each sample at the specified target region.

[code]
# Grab sorted bams from processed_data using the exact same pattern
sorted_bam_files <- list.files(params$processed_data_path, pattern = "_sorted\\.bam$", full.names = TRUE)

if(length(sorted_bam_files) == 0) {
  stop("No sorted BAM files found in ", params$processed_data_path, ". Check alignment step.")
}

message("BAM files found for CrisprSet:")
for(bam_file in sorted_bam_files) {
  message("  - ", basename(bam_file))
}

bam_sample_names <- gsub("_sorted\\.bam$", "", basename(sorted_bam_files))
message("Sample names: ", paste(bam_sample_names, collapse = ", "))

# Check all files exist and are indexed
for(bam_file in sorted_bam_files) {
  if(!file.exists(bam_file)) {
    stop("BAM file not found: ", bam_file)
  }
  if(!file.exists(paste0(bam_file, ".bai"))) {
    message("Indexing missing for: ", basename(bam_file))
    indexBam(bam_file)
  }
}

crispr_set <- readsToTarget(sorted_bam_files, 
                           target = target_location, 
                           reference = reference,
                           names = bam_sample_names, 
                           target.loc = params$target_loc_forward)

DT::datatable(variantCounts(crispr_set))
[code]
DT::datatable(as.data.frame(consensusSeqs(crispr_set)))

4.4 Variant plots

This section creates variant plots for the CRISPR editing outcomes. It uses the CrispRVariants package to visualize the variants detected in the target region across samples.

[code]
gtf_fname <- params$gtf_file

# Check if GTF file exists
if(file.exists(gtf_fname)) {
  message("Creating TxDb from GTF file...")
  txdb <- GenomicFeatures::makeTxDbFromGFF(gtf_fname, format = "gtf")
  
  # Check if there are any genes in our target region
  target_genes <- genes(txdb, filter = list(tx_chrom = seqnames(target_location)[1]))
  overlapping_genes <- subsetByOverlaps(target_genes, target_location)
  
  if(length(overlapping_genes) == 0) {
    message("No genes found overlapping target region. Proceeding without gene annotation.")
    txdb <- NULL
  } else {
    message("Found ", length(overlapping_genes), " genes overlapping target region")
    print(overlapping_genes)
  }
} else {
  message("GTF file not found at: ", gtf_fname)
  message("Proceeding without gene annotation")
  txdb <- NULL
}
[code]
group <- sample_metadata$treatment

plotVariants(crispr_set, txdb = txdb, gene.text.size = 8, 
    row.ht.ratio = c(1,8), col.wdth.ratio = c(4,2),
    plotAlignments.args = list(line.weight = 0.5, ins.size = 2, 
                               legend.symbol.size = 4),
    plotFreqHeatmap.args = list(plot.text.size = 3, x.size = 8, group = group, 
                                legend.text.size = 8, 
                                legend.key.height = grid::unit(0.5, "lines"))) 

TableGrob (2 x 1) "arrange": 2 grobs
  z     cells    name               grob
1 1 (1-1,1-1) arrange rect[GRID.rect.11]
2 2 (2-2,1-1) arrange    gtable[arrange]
[code]
pdf("variant_plot.pdf", width = 8, height = 6)
plotVariants(crispr_set)
TableGrob (2 x 1) "arrange": 2 grobs
  z     cells    name                grob
1 1 (1-1,1-1) arrange rect[GRID.rect.132]
2 2 (2-2,1-1) arrange     gtable[arrange]
[code]
dev.off()
quartz_off_screen 
                2 

4.5 Mutation efficiency

[code]
eff <- mutationEfficiency(crispr_set, filter.cols = "ControlDHS15primers", exclude.cols = "ControlDHS15primers")
eff
ControlDHS2primers   TestDHS15primers            Average             Median 
              2.15               3.34               2.75               2.75 
           Overall              StDev          ReadCount 
              3.34            3621.80          154247.00 
[code]
crispr_set_rev <-  readsToTarget(sorted_bam_files, target = target_location, reference = reference,
                            names = bam_sample_names, target.loc = params$target_loc_reverse, 
                                orientation = "opposite")
plotVariants(crispr_set_rev)

TableGrob (2 x 1) "arrange": 2 grobs
  z     cells    name                grob
1 1 (1-1,1-1) arrange rect[GRID.rect.253]
2 2 (2-2,1-1) arrange     gtable[arrange]

4.6 Calculate frequency of each variant and total variants

[code]
# Calculate frequency of each variant
variant_freq <- variantFrequency(crispr_set, filter.cols = "ControlDHS15primers", exclude.cols = "ControlDHS15primers")
datatable(variant_freq)

5 Reproducibility

5.1 Code document - quarto

The code in quarto document format is included below which can be used to render and rerun this analysis workflow in its entirety. It can be rendered with the quarto publishing system running in the R programming environment in the renv section. You can download the file using the button below.

5.2 R programming environment - renv

This project uses renv to manage package dependencies. renv is an R package that helps create and manage project-specific environments, ensuring that the correct versions of packages are used for reproducibility. To restore the environment, you would typically run renv::restore() in your R session, which will install the packages specified in the renv.lock file. You can download the renv.lock file below to restore the environment.

The current status of the environment is as follows:

[code]
renv::status()
No issues found -- the project is in a consistent state.

6 Session info

[code]
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.5

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Denver
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] gridExtra_2.3          DT_0.33                GenomicFeatures_1.58.0
 [4] AnnotationDbi_1.68.0   Biobase_2.66.0         Rsamtools_2.22.0      
 [7] Biostrings_2.74.1      XVector_0.46.0         GenomicRanges_1.58.0  
[10] GenomeInfoDb_1.42.3    IRanges_2.40.1         S4Vectors_0.44.0      
[13] BiocGenerics_0.52.0    CrispRVariants_1.34.0  ggplot2_3.5.2         

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1            dplyr_1.1.4                
 [3] farver_2.1.2                blob_1.2.4                 
 [5] bitops_1.0-9                fastmap_1.2.0              
 [7] RCurl_1.98-1.17             GenomicAlignments_1.42.0   
 [9] XML_3.99-0.18               digest_0.6.36              
[11] lifecycle_1.0.4             KEGGREST_1.46.0            
[13] RSQLite_2.4.2               magrittr_2.0.3             
[15] compiler_4.4.2              sass_0.4.9                 
[17] rlang_1.1.4                 tools_4.4.2                
[19] yaml_2.3.9                  rtracklayer_1.66.0         
[21] knitr_1.48                  labeling_0.4.3             
[23] S4Arrays_1.6.0              htmlwidgets_1.6.4          
[25] bit_4.6.0                   curl_6.4.0                 
[27] DelayedArray_0.32.0         plyr_1.8.9                 
[29] RColorBrewer_1.1-3          abind_1.4-8                
[31] BiocParallel_1.40.2         withr_3.0.0                
[33] grid_4.4.2                  scales_1.4.0               
[35] SummarizedExperiment_1.36.0 cli_3.6.3                  
[37] rmarkdown_2.27              crayon_1.5.3               
[39] generics_0.1.4              httr_1.4.7                 
[41] reshape2_1.4.4              rjson_0.2.23               
[43] DBI_1.2.3                   cachem_1.1.0               
[45] stringr_1.5.1               zlibbioc_1.52.0            
[47] parallel_4.4.2              BiocManager_1.30.26        
[49] restfulr_0.0.16             base64enc_0.1-3            
[51] matrixStats_1.5.0           vctrs_0.6.5                
[53] Matrix_1.7-1                jsonlite_1.8.8             
[55] bit64_4.6.0-1               crosstalk_1.2.1            
[57] jquerylib_0.1.4             glue_1.7.0                 
[59] codetools_0.2-20            stringi_1.8.7              
[61] gtable_0.3.6                BiocIO_1.16.0              
[63] UCSC.utils_1.2.0            tibble_3.3.0               
[65] pillar_1.11.0               htmltools_0.5.8.1          
[67] GenomeInfoDbData_1.2.13     R6_2.5.1                   
[69] evaluate_0.24.0             lattice_0.22-6             
[71] png_0.1-8                   memoise_2.0.1              
[73] bslib_0.7.0                 renv_1.1.4                 
[75] Rcpp_1.0.12                 SparseArray_1.6.2          
[77] xfun_0.45                   MatrixGenerics_1.18.1      
[79] pkgconfig_2.0.3            

Document render completed at: 2025-07-31 10:04:31

Total render time: 1.24 minutes